{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "GPU_id = 2\n",
    "os.environ['CUDA_VISIBLE_DEVICES'] = str(GPU_id)\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from time import time \n",
    "\n",
    "from fastai import *\n",
    "from fastai.basic_data import *\n",
    "from fastai.basic_data import *\n",
    "from fastai.tabular import *\n",
    "from fastai.basic_data import DataBunch\n",
    "from fastai.tabular import TabularModel\n",
    "\n",
    "import cudf\n",
    "\n",
    "from preproc import *\n",
    "from batchloader import *\n",
    "from helpers import get_mean_reciprocal_rank, roc_auc_score"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- In this notebook we want to benchmark the processing and training time for three diffrent models: \n",
    "\n",
    "- The two first models are using our CuDF processing workflow <a href=#cudf_workflow> section I </a>:\n",
    "     1.  <a href=#first_model> Model 1 </a> : CuDF processing with CPU a copy\n",
    "     2.  <a href=#second_model> Model 2 </a> : CuDF processing in-memory without copy    \n",
    "\n",
    "           \n",
    " - <a href=#third_model> Model 3 </a> : In the second <a href=#fastai_workflow> section II </a>, we are using the Fastai processing workflow to get the scores of the best model found in the section I.  We directly process and create databunch from data_pair_all.pkl dataframe "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**N.B** : For each model, you need to re-start the kernel to free the GPU memory and be able to run all the experiments "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext snakeviz\n",
    "# load snakeviz if you want to run profiling "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<h1> <center> <a id=batchdatabunch>New Data Bunch </a></center> </h1> "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Define a custom databunch fastai that takes a TensorBatchDataLoader instead of the usual torch DataLoader "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "class BatchDataBunch(DataBunch):\n",
    "    \n",
    "    @classmethod\n",
    "    def remove_tfm(cls, tfm:Callable)->None:\n",
    "        \"Remove `tfm` from `self.tfms`.\"\n",
    "        if tfm in cls.tfms: cls.tfms.remove(tfm)\n",
    "            \n",
    "    @classmethod\n",
    "    def add_tfm(cls,tfm:Callable)->None:\n",
    "        \"Add `tfm` to `self.tfms`.\"\n",
    "        cls.tfms.append(tfm)\n",
    "\n",
    "    \n",
    "    @classmethod\n",
    "    def create(cls, train_ds, valid_ds, test_ds=None, path:PathOrStr='.', bs:int=64, val_bs=None, \n",
    "                      num_workers:int=defaults.cpus, device:torch.device=None,\n",
    "                      collate_fn:Callable=data_collate, tfms: List[Callable]=None, \n",
    "                       size:int=None, **kwargs)->'BatchDataBunch':\n",
    "        \n",
    "        \n",
    "        cls.tfms = listify(tfms)\n",
    "        \n",
    "        \n",
    "        val_bs = ifnone(val_bs, bs)\n",
    "        \n",
    "        datasets = [TensorBatchDataset(train_ds, batch_size=bs), \n",
    "                    TensorBatchDataset(valid_ds, batch_size=bs)]\n",
    "        \n",
    "        if valid_ds is not None:\n",
    "            cls.empty_val = False\n",
    "        else:\n",
    "            cls.empty_val = True\n",
    "            \n",
    "        if test_ds is not None:\n",
    "            datasets.append(TensorBatchDataset(test_ds, batch_size=bs))\n",
    "        else: \n",
    "            datasets.append(test_ds)\n",
    "        \n",
    "        cls.device = defaults.device if device is None else device\n",
    "        \n",
    "        dls = [BatchDataLoader(d, shuffle=s, pin_memory=False, drop_last=False, device=cls.device) for d,s in\n",
    "               zip(datasets,(True,False,False)) if d is not None]\n",
    "\n",
    "        cls.path = path \n",
    "        \n",
    "        cls.dls = dls\n",
    "    \n",
    "        \n",
    "        \n",
    "        assert not isinstance(dls[0],DeviceDataLoader)\n",
    "        \n",
    "        \n",
    "        # load batch in device \n",
    "        \n",
    "        if test_ds is not None:\n",
    "            cls.train_dl, cls.valid_dl, cls.test_dl = dls\n",
    "        else: \n",
    "            cls.train_dl, cls.valid_dl = dls\n",
    "            \n",
    "            \n",
    "        cls.path = Path(path)\n",
    "        return cls\n",
    "    \n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- To use the new BatchDatabunch class, we have to build the following processed tensors ( using cudf)  : \n",
    "    - train : cat_tensor, cont_tensor, label_tensor \n",
    "    \n",
    "    - valid : cat_tensor, cont_tensor, label_tensor \n",
    "    \n",
    "    - test : cat_tensor, cont_tensor, label_tensor \n",
    "    \n",
    "- The size of vocaublary of each categorical variable need to be known "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<h1> <center>  <a id=fastai_workflow> Test of Tabular Learner with Fastai workflow </a></center> </h1>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- As the processing time is taking more than 6minutes and our purpose is to benchmark the best model using our proposed workflow against the Fastai workflow. We'll directly compute the scores of the Tabular model with batch size of 204800 and learning rate 0.09 "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<h3> <a id=third_model> Fastai model </a> </h3> "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "batch_size = 4096*50"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "from fastai import *\n",
    "from fastai.basic_data import *\n",
    "from fastai.basic_data import *\n",
    "from fastai.tabular import *\n",
    "from fastai.basic_data import DataBunch\n",
    "from batchloader import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_path = './parquet_data/'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 11.1 s, sys: 15.5 s, total: 26.6 s\n",
      "Wall time: 26.6 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "import pandas as pd\n",
    "path = os.path.join(data_path,'data_pair_all.pkl' )\n",
    "ds = pd.read_pickle(path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>row_id</th>\n",
       "      <th>candidate_order</th>\n",
       "      <th>item_id</th>\n",
       "      <th>price</th>\n",
       "      <th>row_id_count</th>\n",
       "      <th>item_count</th>\n",
       "      <th>user_id</th>\n",
       "      <th>session_id</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>step</th>\n",
       "      <th>...</th>\n",
       "      <th>delta_last_viewed_item_step_interaction item deals</th>\n",
       "      <th>delta_last_viewed_item_timestamp_interaction item deals</th>\n",
       "      <th>price_rank</th>\n",
       "      <th>price_rank_norm</th>\n",
       "      <th>item_count_rank</th>\n",
       "      <th>item_count_rank_norm</th>\n",
       "      <th>count_item_user_id_session_id_rank</th>\n",
       "      <th>count_item_user_id_session_id_rank_norm</th>\n",
       "      <th>count_item_user_id_rank</th>\n",
       "      <th>count_item_user_id_rank_norm</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>461</td>\n",
       "      <td>2</td>\n",
       "      <td>1812701</td>\n",
       "      <td>44</td>\n",
       "      <td>25</td>\n",
       "      <td>177</td>\n",
       "      <td>9Z8H0R5BPH3H</td>\n",
       "      <td>b0d46e23f4544</td>\n",
       "      <td>1541072329</td>\n",
       "      <td>2</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>5</td>\n",
       "      <td>0.20</td>\n",
       "      <td>3</td>\n",
       "      <td>0.12</td>\n",
       "      <td>0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>461</td>\n",
       "      <td>3</td>\n",
       "      <td>5164712</td>\n",
       "      <td>70</td>\n",
       "      <td>25</td>\n",
       "      <td>43</td>\n",
       "      <td>9Z8H0R5BPH3H</td>\n",
       "      <td>b0d46e23f4544</td>\n",
       "      <td>1541072329</td>\n",
       "      <td>2</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>14</td>\n",
       "      <td>0.56</td>\n",
       "      <td>15</td>\n",
       "      <td>0.60</td>\n",
       "      <td>1</td>\n",
       "      <td>0.04</td>\n",
       "      <td>1</td>\n",
       "      <td>0.04</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2 rows × 46 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   row_id  candidate_order  item_id  price  row_id_count  item_count  \\\n",
       "0     461                2  1812701     44            25         177   \n",
       "1     461                3  5164712     70            25          43   \n",
       "\n",
       "        user_id     session_id   timestamp  step  \\\n",
       "0  9Z8H0R5BPH3H  b0d46e23f4544  1541072329     2   \n",
       "1  9Z8H0R5BPH3H  b0d46e23f4544  1541072329     2   \n",
       "\n",
       "               ...               \\\n",
       "0              ...                \n",
       "1              ...                \n",
       "\n",
       "   delta_last_viewed_item_step_interaction item deals  \\\n",
       "0                                                NaN    \n",
       "1                                                NaN    \n",
       "\n",
       "  delta_last_viewed_item_timestamp_interaction item deals price_rank  \\\n",
       "0                                                NaN               5   \n",
       "1                                                NaN              14   \n",
       "\n",
       "  price_rank_norm item_count_rank  item_count_rank_norm  \\\n",
       "0            0.20               3                  0.12   \n",
       "1            0.56              15                  0.60   \n",
       "\n",
       "   count_item_user_id_session_id_rank  \\\n",
       "0                                   0   \n",
       "1                                   1   \n",
       "\n",
       "   count_item_user_id_session_id_rank_norm  count_item_user_id_rank  \\\n",
       "0                                     0.00                        0   \n",
       "1                                     0.04                        1   \n",
       "\n",
       "   count_item_user_id_rank_norm  \n",
       "0                          0.00  \n",
       "1                          0.04  \n",
       "\n",
       "[2 rows x 46 columns]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ds.head(2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<h3> Create pre-processed databunch </h3> "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(42756036, 46) (5762533, 46)\n",
      "CPU times: user 7.85 s, sys: 8.46 s, total: 16.3 s\n",
      "Wall time: 16.3 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# split to train / test \n",
    "train = ds[ds.clickout_missing==0]\n",
    "test = ds[ds.clickout_missing>0]\n",
    "print(train.shape,test.shape)\n",
    "\n",
    "# get categorical and continious variables names \n",
    "cat_names = ['user_id','item_id','platform','city','device','current_filters'] + [i for i in train.columns if i.startswith('is_')]\n",
    "cont_names = ['price','candidate_order'] + [i for i in train.columns if i.startswith('count') or 'rank' in i or i.startswith('delta_')]\n",
    "\n",
    "# define validation rows\n",
    "train['is_va'] = train.row_id%5 == 0\n",
    "del ds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 3min 31s, sys: 2min 40s, total: 6min 11s\n",
      "Wall time: 5min 58s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "procs = [FillMissing, Categorify, Normalize]\n",
    "\n",
    "test_list = TabularList.from_df(test, path='./', cat_names=cat_names, cont_names=cont_names)\n",
    "data = (TabularList.from_df(train, path='./', cat_names=cat_names, cont_names=cont_names, procs=procs)\n",
    "                           .split_from_df('is_va')\n",
    "                           .label_from_df(cols='target')\n",
    "                           .add_test(test_list)\n",
    "                           .databunch(num_workers=8,bs=batch_size, device='cuda'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<h3> Compute validation scores of the best model  </h3> "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Mean / std of scores : 5 runs \n",
    "aucs = []\n",
    "mrrs = []\n",
    "times = []\n",
    "best_bs = 4096*50\n",
    "best_lr = 9e-2\n",
    "\n",
    "emb_sz = [(938604, 16), (903867, 16), (56, 4), (32763, 8), (4, 1), (27842, 8), \n",
    "          (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3)]  \n",
    "\n",
    "# define the model\n",
    "model = TabularModel(emb_szs = emb_sz, n_cont=len(cont_names), out_sz=2, layers=[64, 32])\n",
    "model = model.cuda()\n",
    "learn =  Learner(data, model, metrics=None)\n",
    "learn.loss_func = torch.nn.CrossEntropyLoss()\n",
    "\n",
    "# train the model \n",
    "start = time()\n",
    "learn.fit_one_cycle(1, best_lr)\n",
    "tf = time()-start\n",
    "\n",
    "# get validation metrics \n",
    "yp,y_valid = learn.get_preds()\n",
    "cv = train.loc[train['is_va']>0,['row_id','reference','item_id', 'target']].copy()\n",
    "cv['prob'] = yp.numpy()[:,1]\n",
    "cv = cv.sort_values(by=['row_id','prob'],ascending=False)\n",
    "auc = roc_auc_score(y_valid.numpy().ravel(),yp.numpy()[:,1])\n",
    "mean_reciprocal_rank = get_mean_reciprocal_rank(cv)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "the mrr of the best model is: 0.6111379476650328 \n",
      "the auc of the best mdodel is: 0.8753471591633394 \n",
      "the best mdodel's training time is 366.2465441226959 \n"
     ]
    }
   ],
   "source": [
    "print(\"the mrr of the best model is: %s \" %mean_reciprocal_rank)\n",
    "\n",
    "print(\"the auc of the best mdodel is: %s \" %auc)\n",
    "\n",
    "print(\"the best mdodel's training time is %s \" %tf)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
